In [ ]:
from IPython.core.display import display, HTML, Javascript

html_contents ="""
<!DOCTYPE html>
<html lang="en">
    <head>
    
    <style>
    
.section2{
    Font-Family: 'Playfair Display', Serif;
    font-style: italic;
    font-weight: bold;
    background: #D9D874;
    color:black;
    text-align: center;
    font-size:30px;
    }
    
.section{
    Font-Family: 'Source Sans Pro', Sans-Serif;
    font-weight: bold;
    font-size:20px;
    color:#a02933;
}

.intro{
    Font-Family: 'Playfair Display', Serif;
    text-align:center;
    font-size:15px;
    color:#4e4151;
    letter-spacing:0.5px;
    
    display:fill;
    background-color:#dbba78;
    padding: 0.8em;
    
    border-style: solid;
    border-color:#4e4151;
    border-radius:10px;     
}

.final{
    Font-Family: 'Playfair Display', Serif;
    text-align:center;
    font-size:15px;
    color:#4e4151;
    letter-spacing:0.5px;
    
    display:fill;
    background-color:#dbba78;
    padding: 0.8em;
    
    border-style: dashed;
    border-color:#4e4151;
    border-radius:10px;     
}

mark {
  Font-Family: 'Playfair Display', Serif;
  font-style: italic;
  background: #a02933;
  font-weight: bold;
  color: #dbba78;
  text-align: center;
  font-size:25px;
  justify-content: center;
}

.intro2 {
       padding: 1rem;
       width: 600px;
       0 5px 15px 0 rgba(0,0,0,0.08);
       border-color:#4e4151;
       border-radius:5px;  
       border-style: dashed;
       background-color:#dbba78;
       font-size:15px;
       font-family: 'Playfair Display', Serif;
       font-weight: none;
       letter-spacing:0.5px;
       text-align: left;
       border-width: 2px;
       color: #4e4151;
  
}

hr { border: 0.2px solid #a02933;
           }
    
    
    </style>
    </head>
    
</html>
"""

HTML(html_contents)
Out[ ]:
In [ ]:
from IPython.core.display import display, HTML, Javascript

# ----- Notebook Theme -----
color_map  = ["#bfd3e6", "#9b5b4f", "#4e4151", "#78ccdb", "#E9C9C9", "#909195","#dc1e1e","#a02933","#716807","#E9C9C9"]

prompt = color_map[-1]
main_color = color_map[3]
strong_main_color = color_map[1]
custom_colors = [strong_main_color, main_color]

css_file = ''' 

div #notebook {
background-color: white;
line-height: 20px;
}

#notebook-container {
%s
margin-top: 2em;
padding-top: 2em;
border-top: 4px solid %s; /* light orange */
-webkit-box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
    box-shadow: 0px 0px 8px 2px rgba(224, 212, 226, 0.5); /* pink */
}

div .input {
margin-bottom: 1em;
}

.rendered_html h1, .rendered_html h2, .rendered_html h3, .rendered_html h4, .rendered_html h5, .rendered_html h6 {
color: %s; /* light orange */
font-weight: 600;
}

div.input_area {
border: none;
    background-color: %s; /* rgba(229, 143, 101, 0.1); light orange [exactly #E58F65] */
    border-top: 2px solid %s; /* light orange */
}

div.input_prompt {
color: %s; /* light blue */
}

div.output_prompt {
color: %s; /* strong orange */
}

div.cell.selected:before, div.cell.selected.jupyter-soft-selected:before {
background: %s; /* light orange */
}

div.cell.selected, div.cell.selected.jupyter-soft-selected {
    border-color: %s; /* light orange */
}

.edit_mode div.cell.selected:before {
background: %s; /* light orange */
}

.edit_mode div.cell.selected {
border-color: %s; /* light orange */

}
'''
def to_rgb(h): 
    return tuple(int(h[i:i+2], 16) for i in [0, 2, 4])

main_color_rgba = 'rgba(%s, %s, %s, 0.1)' % (to_rgb(main_color[1:]))
open('notebook.css', 'w').write(css_file % ('width: 95%;', main_color, main_color, main_color_rgba, main_color,  main_color, prompt, main_color, main_color, main_color, main_color))

def nb(): 
    return HTML("<style>" + open("notebook.css", "r").read() + "</style>")
nb()
Out[ ]:
In [ ]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# linear algebra
import numpy as np 
import pandas as pd

#viz
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mlb
import matplotlib.image as mpimg
from matplotlib.offsetbox import AnnotationBbox, OffsetImage

from IPython.display import display
import os
plt.style.use("seaborn")
%matplotlib inline  
import warnings
warnings.filterwarnings("ignore")

#theme
theme = ["#DEF5E5", "#459E97", "#4B9ABB","#0A4361","#E68193",]

📕IMPORTING_LIBRARIES

In [ ]:
df=pd.read_csv("../input/medias-cost-prediction-in-foodmart/media prediction and its cost.csv")
df.head()
Out[ ]:
food_category food_department food_family store_sales(in millions) store_cost(in millions) unit_sales(in millions) promotion_name sales_country marital_status gender ... grocery_sqft frozen_sqft meat_sqft coffee_bar video_store salad_bar prepared_food florist media_type cost
0 Breakfast Foods Frozen Foods Food 7.360 2.723 4.000 Bag Stuffers USA M F ... 18670.000 5415.000 3610.000 1.000 1.000 1.000 1.000 1.000 Daily Paper, Radio 126.620
1 Breakfast Foods Frozen Foods Food 5.520 2.594 3.000 Cash Register Lottery USA M M ... 18670.000 5415.000 3610.000 1.000 1.000 1.000 1.000 1.000 Daily Paper, Radio 59.860
2 Breakfast Foods Frozen Foods Food 3.680 1.362 2.000 High Roller Savings USA S F ... 18670.000 5415.000 3610.000 1.000 1.000 1.000 1.000 1.000 Daily Paper, Radio 84.160
3 Breakfast Foods Frozen Foods Food 3.680 1.178 2.000 Cash Register Lottery USA M F ... 18670.000 5415.000 3610.000 1.000 1.000 1.000 1.000 1.000 In-Store Coupon 95.780
4 Breakfast Foods Frozen Foods Food 4.080 1.428 3.000 Double Down Sale USA M M ... 18670.000 5415.000 3610.000 1.000 1.000 1.000 1.000 1.000 Radio 50.790

5 rows × 40 columns

In [ ]:
#describe the data with some visual style by applying T = Transpose and st#this gives us some general information on the dataset
#it shows us the count of each value, the mean of each value, standard deviation, percentiles and min/max valuesyle
df.describe().T.style.bar(subset=['mean'], color='#E68193')\
                            .background_gradient(subset=['std'], cmap='mako_r')\
                             .background_gradient(subset=['50%'], cmap='mako') 
Out[ ]:
  count mean std min 25% 50% 75% max
store_sales(in millions) 60428.000000 6.541031 3.463047 0.510000 3.810000 5.940000 8.670000 22.920000
store_cost(in millions) 60428.000000 2.619460 1.453009 0.163200 1.500000 2.385600 3.484025 9.726500
unit_sales(in millions) 60428.000000 3.093169 0.827677 1.000000 3.000000 3.000000 4.000000 6.000000
total_children 60428.000000 2.533875 1.490165 0.000000 1.000000 3.000000 4.000000 5.000000
avg_cars_at home(approx) 60428.000000 2.200271 1.109644 0.000000 1.000000 2.000000 3.000000 4.000000
num_children_at_home 60428.000000 0.829351 1.303424 0.000000 0.000000 0.000000 1.000000 5.000000
avg_cars_at home(approx).1 60428.000000 2.200271 1.109644 0.000000 1.000000 2.000000 3.000000 4.000000
SRP 60428.000000 2.115258 0.932829 0.500000 1.410000 2.130000 2.790000 3.980000
gross_weight 60428.000000 13.806433 4.622693 6.000000 9.700000 13.600000 17.700000 21.900000
net_weight 60428.000000 11.796289 4.682986 3.050000 7.710000 11.600000 16.000000 20.800000
recyclable_package 60428.000000 0.558665 0.496551 0.000000 0.000000 1.000000 1.000000 1.000000
low_fat 60428.000000 0.350434 0.477110 0.000000 0.000000 0.000000 1.000000 1.000000
units_per_case 60428.000000 18.860694 10.258555 1.000000 10.000000 19.000000 28.000000 36.000000
store_sqft 60428.000000 27988.477494 5701.022090 20319.000000 23593.000000 27694.000000 30797.000000 39696.000000
grocery_sqft 60428.000000 19133.799696 3987.395735 13305.000000 16232.000000 18670.000000 22123.000000 30351.000000
frozen_sqft 60428.000000 5312.852552 1575.907263 2452.000000 4746.000000 5062.000000 5751.000000 9184.000000
meat_sqft 60428.000000 3541.846280 1050.471635 1635.000000 3164.000000 3375.000000 3834.000000 6122.000000
coffee_bar 60428.000000 0.612646 0.487150 0.000000 0.000000 1.000000 1.000000 1.000000
video_store 60428.000000 0.354157 0.478261 0.000000 0.000000 0.000000 1.000000 1.000000
salad_bar 60428.000000 0.587956 0.492207 0.000000 0.000000 1.000000 1.000000 1.000000
prepared_food 60428.000000 0.587956 0.492207 0.000000 0.000000 1.000000 1.000000 1.000000
florist 60428.000000 0.562603 0.496069 0.000000 0.000000 1.000000 1.000000 1.000000
cost 60428.000000 99.262366 30.011257 50.790000 69.650000 98.520000 126.620000 149.750000
In [ ]:
sns.set(style="ticks", context="talk",font_scale = 1,palette="husl")
plt.figure(figsize = (8,6))
ax = df.dtypes.value_counts().plot(kind='bar',grid = False,fontsize=20)
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+ p.get_width() / 2., height + 0.2, height, ha = 'center', size = 25)
sns.despine()

🧹DATA_CLEANING_

In [ ]:
msno.bar(df,figsize=(15, 5),fontsize=10,color = '#459E97');

as we see that the data is cleaned so we donn have to do any terminologies to handle it

In [ ]:
#checking missing values
missing = df.isnull().sum()
missing = missing[missing > 0]
missing
Out[ ]:
Series([], dtype: int64)
In [ ]:
columns = df.columns
print("******************* Numeric field *******************\n")
for i in range(len(columns)):
    if df[columns[i]].dtypes!=object:
        print("unique number of {} -> {}".format(columns[i], len(df[columns[i]].unique())))
        
print("\n******************* Categorical field *******************\n")
for i in range(len(columns)):
    if df[columns[i]].dtypes==object:
        print("unique number of {} -> {}".format(columns[i], len(df[columns[i]].unique())))
******************* Numeric field *******************

unique number of store_sales(in millions) -> 1033
unique number of store_cost(in millions) -> 9919
unique number of unit_sales(in millions) -> 6
unique number of total_children -> 6
unique number of avg_cars_at home(approx) -> 5
unique number of num_children_at_home -> 6
unique number of avg_cars_at home(approx).1 -> 5
unique number of SRP -> 315
unique number of gross_weight -> 376
unique number of net_weight -> 332
unique number of recyclable_package -> 2
unique number of low_fat -> 2
unique number of units_per_case -> 36
unique number of store_sqft -> 20
unique number of grocery_sqft -> 20
unique number of frozen_sqft -> 20
unique number of meat_sqft -> 20
unique number of coffee_bar -> 2
unique number of video_store -> 2
unique number of salad_bar -> 2
unique number of prepared_food -> 2
unique number of florist -> 2
unique number of cost -> 328

******************* Categorical field *******************

unique number of food_category -> 45
unique number of food_department -> 22
unique number of food_family -> 3
unique number of promotion_name -> 49
unique number of sales_country -> 3
unique number of marital_status -> 2
unique number of gender -> 2
unique number of education -> 5
unique number of member_card -> 4
unique number of occupation -> 5
unique number of houseowner -> 2
unique number of avg. yearly_income -> 8
unique number of brand_name -> 111
unique number of store_type -> 5
unique number of store_city -> 19
unique number of store_state -> 10
unique number of media_type -> 13
In [ ]:
unique_values = {column: list(df[column].unique()) for column in df.select_dtypes(object).columns}
for key, value in unique_values.items():
    print(f'{key} : {value}')
food_category : ['Breakfast Foods', 'Bread', 'Canned Shrimp', 'Baking Goods', 'Vegetables', 'Frozen Desserts', 'Candy', 'Snack Foods', 'Dairy', 'Starchy Foods', 'Cleaning Supplies', 'Decongestants', 'Meat', 'Hot Beverages', 'Jams and Jellies', 'Carbonated Beverages', 'Seafood', 'Specialty', 'Kitchen Products', 'Electrical', 'Beer and Wine', 'Candles', 'Fruit', 'Pure Juice Beverages', 'Canned Soup', 'Paper Products', 'Canned Tuna', 'Eggs', 'Hardware', 'Canned Sardines', 'Canned Clams', 'Pain Relievers', 'Side Dishes', 'Bathroom Products', 'Magazines', 'Frozen Entrees', 'Pizza', 'Cold Remedies', 'Canned Anchovies', 'Drinks', 'Hygiene', 'Plastic Products', 'Canned Oysters', 'Packaged Vegetables', 'Miscellaneous']
food_department : ['Frozen Foods', 'Baked Goods', 'Canned Foods', 'Baking Goods', 'Produce', 'Snacks', 'Snack Foods', 'Dairy', 'Starchy Foods', 'Household', 'Health and Hygiene', 'Meat', 'Beverages', 'Seafood', 'Deli', 'Alcoholic Beverages', 'Canned Products', 'Eggs', 'Periodicals', 'Breakfast Foods', 'Checkout', 'Carousel']
food_family : ['Food', 'Non-Consumable', 'Drink']
promotion_name : ['Bag Stuffers', 'Cash Register Lottery', 'High Roller Savings', 'Double Down Sale', 'Green Light Days', 'Big Time Savings', 'Price Savers', 'Price Slashers', 'Dollar Days', 'Two Day Sale', 'Super Duper Savers', 'Weekend Markdown', 'Dollar Cutters', 'Sales Galore', 'Big Promo', 'Free For All', 'Savings Galore', 'Unbeatable Price Savers', 'Price Smashers', 'Shelf Clearing Days', 'Sales Days', 'Go For It', 'Super Savers', 'Wallet Savers', 'Save-It Sale', 'Price Destroyers', 'Two for One', 'Big Time Discounts', 'Shelf Emptiers', 'Bye Bye Baby', 'One Day Sale', 'Fantastic Discounts', 'Saving Days', 'You Save Days', 'Sale Winners', 'Coupon Spectacular', 'Three for One', 'Price Winners', 'Dimes Off', 'I Cant Believe It Sale', 'Money Savers', 'Green Light Special', 'Price Cutters', 'Tip Top Savings', 'Best Savings', 'Pick Your Savings', 'Double Your Savings', 'Mystery Sale', 'Super Wallet Savers']
sales_country : ['USA', 'Mexico', 'Canada']
marital_status : ['M', 'S']
gender : ['F', 'M']
education : ['Partial High School', 'Bachelors Degree', 'High School Degree', 'Graduate Degree', 'Partial College']
member_card : ['Normal', 'Silver', 'Bronze', 'Golden']
occupation : ['Skilled Manual', 'Professional', 'Manual', 'Management', 'Clerical']
houseowner : ['Y', 'N']
avg. yearly_income : ['$10K - $30K', '$50K - $70K', '$30K - $50K', '$70K - $90K', '$110K - $130K', '$130K - $150K', '$150K +', '$90K - $110K']
brand_name : ['Carrington', 'Golden', 'Imagine', 'Big Time', 'PigTail', 'Fantastic', 'Great', 'Sphinx', 'Modell', 'Colony', 'Blue Label', 'Pleasant', 'Bravo', 'Better', 'Just Right', 'Plato', 'BBB Best', 'Landslide', 'Super', 'CDR', 'High Top', 'Tri-State', 'Hermanos', 'Tell Tale', 'Ebony', 'Thresher', 'Gulf Coast', 'Musial', 'Atomic', 'Choice', 'Fort West', 'Nationeel', 'Horatio', 'Best Choice', 'Fast', 'Gorilla', 'Carlson', 'Even Better', 'Club', 'Booker', 'Shady Lake', 'Monarch', 'Discover', 'Colossal', 'Medalist', 'Jardon', 'Cormorant', 'Sunset', 'Red Wing', 'High Quality', 'Denny', 'Faux Products', 'Steady', 'Consolidated', 'Bird Call', 'Hilltop', 'Ship Shape', 'Footnote', 'Genteel', 'Quick', 'Gerolli', 'Excellent', 'Fabulous', 'Token', 'Skinner', 'Washington', 'Dual City', 'Kiwi', 'Tip Top', 'Amigo', 'Curlew', 'Moms', 'Cutting Edge', 'Red Spade', 'Lake', 'American', 'Walrus', 'Pearl', 'Good', 'Top Measure', 'Portsmouth', 'Toucan', 'Applause', 'Swell', 'Green Ribbon', 'Big City', 'National', 'Blue Medal', 'Urban', 'Jumbo', 'Giant', 'Dollar', 'Mighty Good', 'Robust', 'Gauss', 'Excel', 'Radius', 'Best', 'Jeffers', 'Johnson', 'Special', 'Akron', 'Framton', 'Black Tie', 'Queen', 'James Bay', 'Toretti', 'Prelude', 'Symphony', 'ADJ', 'King']
store_type : ['Deluxe Supermarket', 'Supermarket', 'Gourmet Supermarket', 'Small Grocery', 'Mid-Size Grocery']
store_city : ['Salem', 'Tacoma', 'Seattle', 'Spokane', 'Los Angeles', 'Beverly Hills', 'Portland', 'Bellingham', 'Orizaba', 'Merida', 'Hidalgo', 'Mexico City', 'Vancouver', 'Bremerton', 'Camacho', 'Guadalajara', 'Acapulco', 'San Francisco', 'Victoria']
store_state : ['OR', 'WA', 'CA', 'Veracruz', 'Yucatan', 'Zacatecas', 'DF', 'BC', 'Jalisco', 'Guerrero']
media_type : ['Daily Paper, Radio', 'In-Store Coupon', 'Radio', 'Daily Paper', 'Product Attachment', 'Sunday Paper, Radio', 'Sunday Paper, Radio, TV', 'Sunday Paper', 'Street Handout', 'TV', 'Bulk Mail', 'Cash Register Handout', 'Daily Paper, Radio, TV']
In [ ]:
#so as we see from the unique values of categorial features that we wnna make some text manipulation
for col in df.columns:
  if '$' in df[col].to_string():
            df[col + '_no_dollar'] = df[col].str.replace('$', '').str.replace(',', '')


        
In [ ]:
df["media_type"].value_counts()
Out[ ]:
Daily Paper, Radio         6820
Product Attachment         5371
Daily Paper, Radio, TV     5284
Daily Paper                5119
Street Handout             5069
Radio                      4980
Sunday Paper               4859
In-Store Coupon            4495
Sunday Paper, Radio        4050
Cash Register Handout      4002
TV                         3576
Bulk Mail                  3457
Sunday Paper, Radio, TV    3346
Name: media_type, dtype: int64
In [ ]:
df["low_fat"].unique()
Out[ ]:
array([0., 1.])

ㅤHandling outlairsㅤ

In [ ]:
df.columns
Out[ ]:
Index(['food_category', 'food_department', 'food_family',
       'store_sales(in millions)', 'store_cost(in millions)',
       'unit_sales(in millions)', 'promotion_name', 'sales_country',
       'marital_status', 'gender', 'total_children', 'education',
       'member_card', 'occupation', 'houseowner', 'avg_cars_at home(approx)',
       'avg. yearly_income', 'num_children_at_home',
       'avg_cars_at home(approx).1', 'brand_name', 'SRP', 'gross_weight',
       'net_weight', 'recyclable_package', 'low_fat', 'units_per_case',
       'store_type', 'store_city', 'store_state', 'store_sqft', 'grocery_sqft',
       'frozen_sqft', 'meat_sqft', 'coffee_bar', 'video_store', 'salad_bar',
       'prepared_food', 'florist', 'media_type', 'cost',
       'avg. yearly_income_no_dollar'],
      dtype='object')

so these outlairs that we wanna handle

In [ ]:
numeric_data=df[["store_sales(in millions)","store_cost(in millions)","meat_sqft","frozen_sqft","unit_sales(in millions)"]]
numeric_data.plot(subplots =True, kind = 'box', layout = (5,8), figsize = (30,30),patch_artist=True,color="#459E97")
plt.subplots_adjust(wspace = 0.5)
plt.style.use("ggplot")
plt.show()
In [ ]:
def out_lairs(col):
    """
    this function take the name of column and compute the median(Q2)represent 50% percentage
    and also compute the Q3(75%)from the boxblot then compute the inter_quantile_range 
    and then we compute the upper limit and the lowerlimit to show the origin of the outlairs
    
    """
    Q1 = df[col].quantile(0.25)
    Q3 =  df[col].quantile(0.75)
    IQR = Q3-Q1
    data_out = df.loc[(df[col] < (Q1-1.5*IQR)) | (df[col] > (Q3+1.5*IQR))][:10]
    
    return data_out 
In [ ]:
out_lairs("store_sales(in millions)")
Out[ ]:
food_category food_department food_family store_sales(in millions) store_cost(in millions) unit_sales(in millions) promotion_name sales_country marital_status gender ... frozen_sqft meat_sqft coffee_bar video_store salad_bar prepared_food florist media_type cost avg. yearly_income_no_dollar
72 Breakfast Foods Frozen Foods Food 16.200 5.346 5.000 Two for One USA S F ... 4923.000 3282.000 0.000 0.000 0.000 0.000 0.000 Daily Paper, Radio, TV 57.520 50K - 70K
102 Breakfast Foods Frozen Foods Food 16.200 6.156 5.000 Weekend Markdown USA S M ... 2452.000 1635.000 0.000 0.000 0.000 0.000 0.000 Daily Paper 140.570 50K - 70K
138 Breakfast Foods Frozen Foods Food 16.200 5.508 5.000 Two for One Mexico M M ... 4819.000 3213.000 0.000 0.000 0.000 0.000 0.000 Cash Register Handout 123.630 130K - 150K
171 Breakfast Foods Frozen Foods Food 16.200 5.508 5.000 Price Slashers Canada S M ... 4016.000 2678.000 1.000 1.000 1.000 1.000 1.000 Sunday Paper, Radio, TV 77.240 70K - 90K
371 Bread Baked Goods Food 19.650 6.288 5.000 Dollar Cutters USA M M ... 4746.000 3164.000 1.000 0.000 0.000 0.000 0.000 Daily Paper, Radio, TV 144.180 30K - 50K
384 Bread Baked Goods Food 18.700 6.919 5.000 Shelf Clearing Days USA M M ... 4746.000 3164.000 1.000 0.000 0.000 0.000 0.000 Radio 99.770 10K - 30K
407 Bread Baked Goods Food 19.300 7.141 5.000 Two for One USA M M ... 4923.000 3282.000 0.000 0.000 0.000 0.000 0.000 Daily Paper, Radio, TV 57.520 30K - 50K
457 Bread Baked Goods Food 18.700 6.358 5.000 Two for One USA M M ... 4923.000 3282.000 0.000 0.000 0.000 0.000 0.000 Daily Paper, Radio, TV 57.520 130K - 150K
673 Bread Baked Goods Food 19.650 8.253 5.000 Money Savers Mexico M M ... 6393.000 4262.000 1.000 1.000 1.000 1.000 1.000 TV 65.700 130K - 150K
850 Bread Baked Goods Food 18.700 6.545 5.000 Price Slashers Canada M F ... 4016.000 2678.000 1.000 1.000 1.000 1.000 1.000 Sunday Paper, Radio, TV 77.240 30K - 50K

10 rows × 41 columns

In [ ]:
mask = np.zeros_like(df.corr(), dtype=np.bool)
mask[np.triu_indices_from(mask)]= True


fig, ax = plt.subplots(figsize=(35, 27))

heatmap = sns.heatmap(df.corr(),
                      mask = mask,
                      square = True,
                      linewidths = .7,
                      cmap = 'PuBuGn',
                      cbar_kws = {'shrink': .8,"ticks" : [-1, -0.5, 0, 0.5, 1]},
                      vmin = -1,
                      vmax = 1,
                      annot = True,
                      annot_kws = {"size": 10})

#add the column names as labels
ax.set_yticklabels(df.corr(), rotation = 0)
ax.set_xticklabels(df.corr())

sns.set_style({'xtick.bottom': True}, {'ytick.left': True})

ax.annotate('correlation',
                    fontsize=10,fontweight='bold',
                    xy=(2.3, 4.2), xycoords='data',
                    xytext=(0.6, 0.95), textcoords='axes fraction',
                    arrowprops=dict(
                    facecolor=theme[4], shrink=0.025, 
                    connectionstyle='arc3, rad=0.80'),
                    horizontalalignment='left', verticalalignment='top'
)

ax.set_title('Correlation of Features', size = 50, color = theme[3], weight='bold', pad=40)
Out[ ]:
Text(0.5, 1.0, 'Correlation of Features')

I choose the thresholds with 0.8 and -0.8 so features that greater than 0.8 or lesser than -0.8 are considered as strong positive correlation and strong negative correlation respectively

  • As the charts show that prepared_food and salad_bar are strong positive relationship (they have 1 for correlation point), avg_cars_at_home(approx) with avg_cars_at home(approx).1 for 1 followed by net_weight and gross_weight with 0.99

meat_sqft and frozen_sqft have strong correlation with 1

  • store_sales and store_cost also so a high correlation with 0.95 and grocery_sqft with store_sqft for 0.91

  • SRP has high correlation with store_sales and store_cost for 0.83 and 0.8 respectivvely

  • In conclusion, I chosed to remove salad_bar, gross_weight, avg_cars_at home(approx).1, store_sales, store_cost, meat_sqft, store_sqft

Shapiro-Wilk Normality Testㅤ

In [ ]:
numeric_data=df.select_dtypes(exclude=["object"])
numeric_feature = df.dtypes!=object
final_numeric_feature = df.columns[numeric_feature].tolist()
In [ ]:
from scipy.stats import shapiro
for i in range(len(final_numeric_feature)):
    stat,p = shapiro(df[final_numeric_feature[i]])
    print(final_numeric_feature[i])
    print('P_value=%.3f' % (p))
    print("*******************************")
store_sales(in millions)
P_value=0.000
*******************************
store_cost(in millions)
P_value=0.000
*******************************
unit_sales(in millions)
P_value=0.000
*******************************
total_children
P_value=0.000
*******************************
avg_cars_at home(approx)
P_value=0.000
*******************************
num_children_at_home
P_value=0.000
*******************************
avg_cars_at home(approx).1
P_value=0.000
*******************************
SRP
P_value=0.000
*******************************
gross_weight
P_value=0.000
*******************************
net_weight
P_value=0.000
*******************************
recyclable_package
P_value=0.000
*******************************
low_fat
P_value=0.000
*******************************
units_per_case
P_value=0.000
*******************************
store_sqft
P_value=0.000
*******************************
grocery_sqft
P_value=0.000
*******************************
frozen_sqft
P_value=0.000
*******************************
meat_sqft
P_value=0.000
*******************************
coffee_bar
P_value=0.000
*******************************
video_store
P_value=0.000
*******************************
salad_bar
P_value=0.000
*******************************
prepared_food
P_value=0.000
*******************************
florist
P_value=0.000
*******************************
cost
P_value=0.000
*******************************

In Shapiro-Wilk Normality Test : H0 : The data are normally distributed

According to the P_value obtained from the Shapiro test (p_value &lt 5%), we reject our null hypothesis (H0) --> So the numerical fields do not follow the normal distribution.

as we see that we have skewness and outlairs lets analyisse the origin of these outalirs to see hoe we are gonna hadle it

In [ ]:
numeric_data=df.select_dtypes(exclude=["object"])
numeric_data.shape
Out[ ]:
(60428, 23)
In [ ]:
plt.figure(figsize = (30,30))
for ax, col in enumerate(numeric_data.columns[:24]):
    plt.subplot(6,4, ax + 1)
    plt.title(col)
    plotting = sns.kdeplot(x = numeric_data[col],fill=True, common_norm=False, color="#E68193",alpha=.9, linewidth=3);
    
plt.tight_layout()
plt.title('Distribution')
plotting.figure.suptitle(' Distribution of features ',y=1.08, size = 26, color = theme[3], weight='bold');

we can see here that there are categorial variables but in numerical features and we have a skewed fetures also

Welch's t-test

H0: There are differences between lst_0 and lst_1 aka mean(lst_0) == mean(lst_1)

Ha: There are NO differences between lst0 and lst_1 aka mean(lst_0) != mean(lst_1)

In [ ]:
from scipy.stats import ttest_ind
lst_cate = [ 'recyclable_package','low_fat', 'coffee_bar', 'video_store', 'prepared_food', 'florist']
alpha = .05
for i in lst_cate:
    lst_0 = df.cost[df[i] == 0]
    lst_1 = df.cost[df[i] == 1]
    t, p = ttest_ind(lst_0, lst_1, equal_var = False)
#--------------------------------------------------------------------------------
    print(f'p-value = {p:.2f}, alpha = {alpha:.2f}')
    if (p < alpha):
        print('p < alpha => reject H0 => there are differences between', i, 'and cost')
    else:
        print('p > alpha => failed to reject H0 => there are NO differences between', i, 'and cost')
    print('\n')
p-value = 0.67, alpha = 0.05
p > alpha => failed to reject H0 => there are NO differences between recyclable_package and cost


p-value = 0.30, alpha = 0.05
p > alpha => failed to reject H0 => there are NO differences between low_fat and cost


p-value = 0.00, alpha = 0.05
p < alpha => reject H0 => there are differences between coffee_bar and cost


p-value = 0.00, alpha = 0.05
p < alpha => reject H0 => there are differences between video_store and cost


p-value = 0.00, alpha = 0.05
p < alpha => reject H0 => there are differences between prepared_food and cost


p-value = 0.00, alpha = 0.05
p < alpha => reject H0 => there are differences between florist and cost


In [ ]:
df["unit_sales"]=df["store_sales(in millions)"]-df["store_cost(in millions)"]
df["unit_sales"]
Out[ ]:
0       4.637
1       2.926
2       2.318
3       2.502
4       2.652
         ... 
60423   1.435
60424   1.104
60425   2.981
60426   5.713
60427   4.968
Name: unit_sales, Length: 60428, dtype: float64

📺Exloratory Data Analysis

The main goal of this content is find the most features that affect to the prediction of cost to acquire a customer via media so I will focus on EDA and inferential analytics if necessary to explore the features

In [ ]:
#from dataprep.datasets import load_dataset
#from dataprep.eda import create_report

#report=create_report(df)

#report.save("projectcost")
#report

UNI_VARIATE_ANALYSIS

In [ ]:
import plotly.express as px
division_rat = px.pie(df, names='total_children', values='total_children', hole=0.6, title='Overall total children',
                      color_discrete_sequence=px.colors.qualitative.T10)
division_rat.update_traces(textfont=dict(color='#fff'))
division_rat.update_layout(autosize=True, height=300, width=800,
                           margin=dict(t=80, b=30, l=70, r=40),
                           plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                           title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                           font=dict(color='#8a8d93'),
                           legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                           )
In [ ]:
df["total_children"].value_counts()
Out[ ]:
2.000    12518
4.000    12427
3.000    11921
1.000    11770
5.000     6168
0.000     5624
Name: total_children, dtype: int64

COUNT OF UNIT_SALES ,WHAT IS THE higest ten food_category sold and food_department?¶

In [ ]:
import plotly.express as px
division_rat = px.pie(df, names='unit_sales(in millions)', values='unit_sales(in millions)', hole=0.7, title='Overall unit_sales',
                      color_discrete_sequence=px.colors.qualitative.T10)
division_rat.update_traces(textfont=dict(color='#fff'))
division_rat.update_layout(autosize=True, height=300, width=500,
                           margin=dict(t=80, b=30, l=70, r=40),
                           plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                           title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                           font=dict(color='#8a8d93'),
                           legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                           )

BIVARIATE_ANALYSIS

FOOD_CATEGORY¶

In [ ]:
data=df.groupby("food_category")[["unit_sales(in millions)"]].sum().sort_values(by=['unit_sales(in millions)'],ascending=[False]).reset_index()
px.bar(data, x= 'food_category', y="unit_sales(in millions)",color='food_category',color_discrete_sequence=px.colors.qualitative.Bold)

what is the most highest ten food_category achieve the highest net salary?¶

In [ ]:
grp_product = df.groupby('food_category')
sales_by_product = grp_product['unit_sales'].mean().reset_index()
sales_by_product = sales_by_product.sort_values(by='unit_sales', ascending=False)
sales_by_product[:10]
Out[ ]:
food_category unit_sales
37 Plastic Products 4.839
34 Pain Relievers 4.698
11 Canned Shrimp 4.669
7 Canned Anchovies 4.657
19 Drinks 4.351
21 Electrical 4.320
43 Starchy Foods 4.243
29 Kitchen Products 4.225
23 Frozen Entrees 4.224
41 Snack Foods 4.135
In [ ]:
plt.figure(figsize=(15,6))
plt.barh(sales_by_product[:10]['food_category'], sales_by_product[:10]['unit_sales'])
plt.xticks(rotation='vertical', size=8)
plt.xlabel('Sales in millions')
plt.ylabel('Product')
plt.title('Top 10 Products With Most Sales')
for index, value in enumerate(sales_by_product[:10]['unit_sales']):
    plt.text(value, index,
             str(value))
plt.show()

as we see that the (pain relievers and plastic products and canned shrimp) achieve the highest net sales

what is the three higest food category in each highest supermarket?¶

In [ ]:
supermaret_store = df[df['store_type']=="Supermarket"] 
duluxe_supermaret_store = df[df['store_type']=="Deluxe Supermarket"] 
Gourmet_supermaret_store = df[df['store_type']=="Gourmet Supermarket"]
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"]
#Small_Grocery_store = data[data['store_type']=="Small Grocery"]

stores=["Supermarket","Deluxe Supermarket","Gourmet Supermarket"]
fig, ax = plt.subplots(1,3, figsize=(36,10))

for i,store in enumerate([supermaret_store , duluxe_supermaret_store, Gourmet_supermaret_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('food_category')['unit_sales'].mean().reset_index()
    df_category_quantity.sort_values(by='unit_sales', ascending=False)
    plt.barh(df_category_quantity[:10]['food_category'], df_category_quantity[:10]['unit_sales'])

    plt.xticks(rotation='vertical', size=20)
    plt.yticks(size=20)

    plt.xlabel('Quantity')
    plt.ylabel('Category')
    plt.title('net salary of of each food category   in Each super market')
    plt.title(stores[i], fontsize=20,color='brown') 
fig.suptitle('net salary of of each food category in Each super market',fontsize=20, color='brown', y=.95)


   
Out[ ]:
Text(0.5, 0.95, 'net salary of of each food category in Each super market')

in the three highest supermarket sales the highest food category is cannedanchovies

In [ ]:
food = df['food_family'].value_counts().reset_index()
food
Out[ ]:
index food_family
0 Food 43284
1 Non-Consumable 11573
2 Drink 5571

WHAT IS THE HIGHEST FOOD FAMILY?¶

In [ ]:
classname2 = px.histogram(df, x='food_family',
                         title='COUNT OF FOOD_FAMILY', height=300,width=600
                         ,color_discrete_sequence=['#03DAC5'],
                         )
classname2.update_yaxes(showgrid=False),
classname2.update_xaxes(categoryorder='total descending')
classname2.update_traces(hovertemplate=None)
classname2.update_layout(margin=dict(t=100, b=0, l=70, r=40),
                        hovermode="x unified",
                        xaxis_tickangle=360,
                        xaxis_title=' ', yaxis_title=" ",
                        plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                        title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                        font=dict(color='#8a8d93'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                          )

as we see that the highest food family is food bc the most things the food mart selled is food.

WHICH MARITAL STATUES MOST EFFECTIVE ON FOOD_FAMILY TYPES?¶

In [ ]:
classname = px.histogram(df, x='food_family', color='marital_status',
                         title='FOOD_FAMILY BASED ON MARITAL STATUS',height=300,width=600,
                         category_orders={'marital_status': ['S', 'M']},
                         color_discrete_sequence=['#DB6574', '#03DAC5'],
                         )
classname.update_yaxes(showgrid=False),
classname.update_xaxes(categoryorder='total descending')
classname.update_traces(hovertemplate=None)
classname.update_layout(margin=dict(t=100, b=0, l=70, r=40),
                        hovermode="x unified",
                        xaxis_tickangle=360,
                        xaxis_title=' ', yaxis_title=" ",
                        plot_bgcolor='#2d3035', paper_bgcolor='#2d3035',
                        title_font=dict(size=25, color='#a5a7ab', family="Muli, sans-serif"),
                        font=dict(color='#8a8d93'),
                        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
                          )

in food ⁉ married is less than single

in non_consumble states: married is high than single

in Drink: married is high than single

Products performing better and worse in Tier 1 cities v/s Tier 2 & 3 cities in terms of Sales?¶

In [ ]:
top_region = df.groupby('sales_country')['unit_sales'].sum().sort_values(ascending=False).head(10)
top_region = top_region.reset_index()
In [ ]:
import plotly.graph_objects as go
fig = go.Figure(go.Pie(labels=top_region.sales_country,
                             values = top_region.unit_sales,
                             customdata = top_region.unit_sales,
                            hovertemplate = "Region:%{label} <br> Sales: %{customdata} <extra></extra>",
                           pull= [0.1,0,0,0.1],
                           
                            ))

fig.update_layout(
                   title={
                        'text': "<b>Sales in different Regions",
                        'y':0.9,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'},
                 
                  )


fig.show()

as we see above that USA has the highest net salaries than the othe countries cause it has the most num stores

In [ ]:
df_tier1=pd.DataFrame(df.loc[df['sales_country'] =='USA'])#for tier 1
df_tier2=pd.DataFrame(df.loc[df['sales_country'] =='Mexico']) #for tier 2
df_tier3=pd.DataFrame(df.loc[df['sales_country'] =='Canada']) #for tier 3
#df_cities=pd.concat([df_tier2,df_tier3],axis=0) #combining dfs for tier 2 & 3
In [ ]:
df_tier1_list=df_tier1.groupby("food_department")[["cost"]].mean().sort_values(by=['cost'],ascending=[False]).reset_index()
df_tier1_list.rename(columns = {'food_department':'Items_Tier_1','cost':'Sales_of_usa'}, inplace = True)
df_tier1_list
Out[ ]:
Items_Tier_1 Sales_of_usa
0 Checkout 101.306
1 Breakfast Foods 99.904
2 Canned Foods 99.501
3 Meat 99.470
4 Deli 99.273
5 Carousel 99.064
6 Snack Foods 98.970
7 Baked Goods 98.867
8 Eggs 98.854
9 Frozen Foods 98.837
10 Starchy Foods 98.732
11 Seafood 98.709
12 Baking Goods 98.463
13 Dairy 98.454
14 Canned Products 98.154
15 Alcoholic Beverages 98.097
16 Health and Hygiene 97.798
17 Beverages 97.792
18 Produce 97.749
19 Household 97.711
20 Snacks 96.917
21 Periodicals 96.263
In [ ]:
df_tier2_list=df_tier2.groupby("food_department")[["cost"]].mean().sort_values(by=['cost'],ascending=[False]).reset_index()
df_tier2_list.rename(columns = {'food_department':'Items_Tier2','cost':'Sales_of_Mexico'}, inplace = True)
df_tier2_list
Out[ ]:
Items_Tier2 Sales_of_Mexico
0 Periodicals 102.395
1 Canned Foods 102.146
2 Beverages 101.972
3 Baking Goods 101.732
4 Eggs 101.351
5 Health and Hygiene 101.145
6 Household 101.129
7 Snack Foods 100.796
8 Frozen Foods 100.785
9 Alcoholic Beverages 100.699
10 Snacks 100.682
11 Deli 100.627
12 Seafood 100.572
13 Carousel 100.197
14 Starchy Foods 100.112
15 Produce 100.029
16 Baked Goods 99.903
17 Canned Products 99.871
18 Dairy 99.188
19 Meat 99.084
20 Breakfast Foods 98.168
21 Checkout 97.591
In [ ]:
df_tier3_list=df_tier3.groupby("food_department")[["cost"]].mean().sort_values(by=['cost'],ascending=[False]).reset_index()
df_tier3_list.rename(columns = {'food_department':'Items_Tier3','cost':'Sales_of_canda'}, inplace = True)
df_tier3_list
Out[ ]:
Items_Tier3 Sales_of_canda
0 Snacks 109.589
1 Periodicals 105.818
2 Canned Products 104.617
3 Health and Hygiene 103.927
4 Baked Goods 103.757
5 Breakfast Foods 102.233
6 Produce 101.861
7 Snack Foods 101.245
8 Frozen Foods 101.232
9 Deli 101.201
10 Checkout 101.075
11 Eggs 100.990
12 Alcoholic Beverages 100.619
13 Household 99.459
14 Beverages 99.355
15 Dairy 99.327
16 Baking Goods 99.213
17 Starchy Foods 98.701
18 Canned Foods 98.652
19 Carousel 98.185
20 Meat 96.982
21 Seafood 95.221
In [ ]:
df_sales=pd.concat([df_tier1_list,df_tier2_list,df_tier3_list],axis=1)
df_sales
Out[ ]:
Items_Tier_1 Sales_of_usa Items_Tier2 Sales_of_Mexico Items_Tier3 Sales_of_canda
0 Checkout 101.306 Periodicals 102.395 Snacks 109.589
1 Breakfast Foods 99.904 Canned Foods 102.146 Periodicals 105.818
2 Canned Foods 99.501 Beverages 101.972 Canned Products 104.617
3 Meat 99.470 Baking Goods 101.732 Health and Hygiene 103.927
4 Deli 99.273 Eggs 101.351 Baked Goods 103.757
5 Carousel 99.064 Health and Hygiene 101.145 Breakfast Foods 102.233
6 Snack Foods 98.970 Household 101.129 Produce 101.861
7 Baked Goods 98.867 Snack Foods 100.796 Snack Foods 101.245
8 Eggs 98.854 Frozen Foods 100.785 Frozen Foods 101.232
9 Frozen Foods 98.837 Alcoholic Beverages 100.699 Deli 101.201
10 Starchy Foods 98.732 Snacks 100.682 Checkout 101.075
11 Seafood 98.709 Deli 100.627 Eggs 100.990
12 Baking Goods 98.463 Seafood 100.572 Alcoholic Beverages 100.619
13 Dairy 98.454 Carousel 100.197 Household 99.459
14 Canned Products 98.154 Starchy Foods 100.112 Beverages 99.355
15 Alcoholic Beverages 98.097 Produce 100.029 Dairy 99.327
16 Health and Hygiene 97.798 Baked Goods 99.903 Baking Goods 99.213
17 Beverages 97.792 Canned Products 99.871 Starchy Foods 98.701
18 Produce 97.749 Dairy 99.188 Canned Foods 98.652
19 Household 97.711 Meat 99.084 Carousel 98.185
20 Snacks 96.917 Breakfast Foods 98.168 Meat 96.982
21 Periodicals 96.263 Checkout 97.591 Seafood 95.221

in each country which food department achieve highest net salary comparing to other coyuntries?¶

In [ ]:
import plotly.graph_objects as go   #importing the library
x=df_sales['Items_Tier_1']
y1=df_sales['Sales_of_usa']
y2=df_sales['Sales_of_Mexico']
y3=df_sales['Sales_of_canda']

# Create traces

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y1,
                    mode='lines+markers',line_color='rgb(0,100,80)',
                    name='Sales_of_usa'))
fig.add_trace(go.Scatter(x=x, y=y2,
                    mode='lines+markers',
                    name='Sales_of_Mexico'))
fig.add_trace(go.Scatter(x=x, y=y3,
                    mode='lines+markers',
                    name='Sales_of_canda'))

fig.show()

as we see from above that CANADA has the highest net sales for every food department expect from Dairy food department become to decrease

and in canda we can see that it has the lowest net salaries comparing to the other countries expect snacks

in MEXICO we see that it is in the middle of salaries expect Backing Goods and and begin to have the highest salaries to periodicals

In [ ]:
df_tier1_list=df_tier1.groupby("food_department")[["unit_sales"]].mean().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
df_tier1_list.rename(columns = {'food_department':'Items_Tier_1','unit_sales':'Sales_of_usa'}, inplace = True)
df_tier1_list
Out[ ]:
Items_Tier_1 Sales_of_usa
0 Starchy Foods 4.325
1 Snack Foods 4.125
2 Household 4.102
3 Seafood 4.029
4 Dairy 4.025
5 Produce 3.994
6 Eggs 3.969
7 Checkout 3.969
8 Baked Goods 3.946
9 Breakfast Foods 3.937
10 Snacks 3.917
11 Meat 3.912
12 Deli 3.904
13 Canned Foods 3.802
14 Frozen Foods 3.794
15 Alcoholic Beverages 3.793
16 Periodicals 3.772
17 Health and Hygiene 3.768
18 Beverages 3.740
19 Baking Goods 3.537
20 Carousel 3.517
21 Canned Products 3.324
In [ ]:
df_tier2_list=df_tier2.groupby("food_department")[["unit_sales"]].mean().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
df_tier2_list.rename(columns = {'food_department':'Items_Tier2','unit_sales':'Sales_of_Mexico'}, inplace = True)
df_tier2_list
Out[ ]:
Items_Tier2 Sales_of_Mexico
0 Snack Foods 4.164
1 Household 4.162
2 Produce 4.041
3 Dairy 4.025
4 Seafood 4.013
5 Starchy Foods 3.978
6 Snacks 3.972
7 Eggs 3.961
8 Periodicals 3.939
9 Baked Goods 3.925
10 Beverages 3.885
11 Canned Foods 3.873
12 Frozen Foods 3.865
13 Alcoholic Beverages 3.849
14 Deli 3.840
15 Meat 3.835
16 Checkout 3.772
17 Breakfast Foods 3.762
18 Health and Hygiene 3.708
19 Baking Goods 3.596
20 Canned Products 3.483
21 Carousel 3.327
In [ ]:
df_tier3_list=df_tier3.groupby("food_department")[["unit_sales"]].mean().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
df_tier3_list.rename(columns = {'food_department':'Items_Tier2','unit_sales':'Sales_of_canada'}, inplace = True)
df_tier3_list
Out[ ]:
Items_Tier2 Sales_of_canada
0 Eggs 5.074
1 Starchy Foods 4.527
2 Snacks 4.261
3 Household 4.235
4 Meat 4.166
5 Snack Foods 4.108
6 Deli 4.013
7 Baked Goods 3.996
8 Produce 3.959
9 Breakfast Foods 3.927
10 Checkout 3.899
11 Frozen Foods 3.898
12 Periodicals 3.892
13 Canned Foods 3.836
14 Beverages 3.815
15 Health and Hygiene 3.798
16 Baking Goods 3.796
17 Dairy 3.780
18 Alcoholic Beverages 3.693
19 Canned Products 3.526
20 Carousel 3.381
21 Seafood 3.082
In [ ]:
df_sales=pd.concat([df_tier1_list,df_tier2_list,df_tier3_list],axis=1)
df_sales
Out[ ]:
Items_Tier_1 Sales_of_usa Items_Tier2 Sales_of_Mexico Items_Tier2 Sales_of_canada
0 Starchy Foods 4.325 Snack Foods 4.164 Eggs 5.074
1 Snack Foods 4.125 Household 4.162 Starchy Foods 4.527
2 Household 4.102 Produce 4.041 Snacks 4.261
3 Seafood 4.029 Dairy 4.025 Household 4.235
4 Dairy 4.025 Seafood 4.013 Meat 4.166
5 Produce 3.994 Starchy Foods 3.978 Snack Foods 4.108
6 Eggs 3.969 Snacks 3.972 Deli 4.013
7 Checkout 3.969 Eggs 3.961 Baked Goods 3.996
8 Baked Goods 3.946 Periodicals 3.939 Produce 3.959
9 Breakfast Foods 3.937 Baked Goods 3.925 Breakfast Foods 3.927
10 Snacks 3.917 Beverages 3.885 Checkout 3.899
11 Meat 3.912 Canned Foods 3.873 Frozen Foods 3.898
12 Deli 3.904 Frozen Foods 3.865 Periodicals 3.892
13 Canned Foods 3.802 Alcoholic Beverages 3.849 Canned Foods 3.836
14 Frozen Foods 3.794 Deli 3.840 Beverages 3.815
15 Alcoholic Beverages 3.793 Meat 3.835 Health and Hygiene 3.798
16 Periodicals 3.772 Checkout 3.772 Baking Goods 3.796
17 Health and Hygiene 3.768 Breakfast Foods 3.762 Dairy 3.780
18 Beverages 3.740 Health and Hygiene 3.708 Alcoholic Beverages 3.693
19 Baking Goods 3.537 Baking Goods 3.596 Canned Products 3.526
20 Carousel 3.517 Canned Products 3.483 Carousel 3.381
21 Canned Products 3.324 Carousel 3.327 Seafood 3.082
In [ ]:
import plotly.graph_objects as go   #importing the library
x=df_sales['Items_Tier_1']
y1=df_sales['Sales_of_usa']
y2=df_sales['Sales_of_Mexico']
y3=df_sales['Sales_of_canada']

# Create traces

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y1,
                    mode='lines+markers',line_color='rgb(0,100,80)',
                    name='Sales_of_usa'))
fig.add_trace(go.Scatter(x=x, y=y2,
                    mode='lines+markers',
                    name='Sales_of_Mexico'))
fig.add_trace(go.Scatter(x=x, y=y3,
                    mode='lines+markers',
                    name='Sales_of_canda'))

fig.show()

as we see from above that CANADA has the highest net sales for starchy food and snack food department except from canned products department become to decrease

what is the highest cities according to net sales?¶

In [ ]:
top_region = df.groupby('store_city')['unit_sales'].mean().sort_values(ascending=False).head(10)
top_region = top_region.reset_index()
In [ ]:
fig = go.Figure(go.Pie(labels=top_region.store_city,
                             values = top_region.unit_sales,
                             customdata = top_region.unit_sales,
                            hovertemplate = "Region:%{label} <br> Sales: %{customdata} <extra></extra>",
                           pull= [0.1,0,0,0.1],
                           
                            ))

fig.update_layout(
                   title={
                        'text': "<b>Sales in different Regions",
                        'y':0.9,
                        'x':0.5,
                        'xanchor': 'center',
                        'yanchor': 'top'},
                 
                  )


fig.show()

the highest two cities according to net sales is portland and merida

In [ ]:
df_tier1=pd.DataFrame(df.loc[df['store_city'] =='Tacoma'])#for tier 1
df_tier2=pd.DataFrame(df.loc[df['store_city'] =='Portland']) #for tier 2
df_tier3=pd.DataFrame(df.loc[df['store_city'] =='Seattle']) #for tier 3
df_cities=pd.concat([df_tier2,df_tier3],axis=0) #combining dfs for tier 2 & 3
In [ ]:
df_tier1_list=df_tier1.groupby("food_department")[["unit_sales"]].sum().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
df_tier1_list.rename(columns = {'food_department':'Items_Tier_1','unit_sales':'Sales_tier_1'}, inplace = True)
In [ ]:
df_cities_list=df_cities.groupby("food_department")[["unit_sales"]].sum().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
df_cities_list.rename(columns = {'food_department':'Items_Tier2_3','unit_sales':'Sales_tier2_3'}, inplace = True)
In [ ]:
df3_sales=pd.concat([df_tier1_list,df_cities_list],axis=1)
df3_sales
Out[ ]:
Items_Tier_1 Sales_tier_1 Items_Tier2_3 Sales_tier2_3
0 Produce 3310.340 Produce 5848.674
1 Snack Foods 2588.700 Snack Foods 5080.282
2 Household 2424.999 Household 4384.266
3 Frozen Foods 2351.870 Frozen Foods 4028.983
4 Baking Goods 1607.119 Canned Foods 2729.122
5 Canned Foods 1551.089 Dairy 2726.800
6 Dairy 1473.343 Baking Goods 2671.489
7 Health and Hygiene 1410.649 Health and Hygiene 2550.390
8 Beverages 1120.764 Deli 1998.781
9 Deli 976.781 Beverages 1756.473
10 Baked Goods 708.268 Alcoholic Beverages 1220.921
11 Snacks 618.862 Baked Goods 1169.418
12 Alcoholic Beverages 533.222 Snacks 939.734
13 Starchy Foods 413.745 Starchy Foods 914.263
14 Eggs 374.990 Eggs 738.982
15 Periodicals 374.792 Breakfast Foods 557.957
16 Breakfast Foods 266.940 Periodicals 545.851
17 Seafood 141.617 Seafood 278.238
18 Checkout 132.766 Meat 257.669
19 Canned Products 125.018 Canned Products 227.515
20 Meat 115.076 Checkout 219.303
21 Carousel 67.462 Carousel 152.418
In [ ]:
import plotly.graph_objects as go   #importing the library
x=df3_sales['Items_Tier_1']
y1=df3_sales['Sales_tier_1']
y2=df3_sales['Sales_tier2_3']

# Create traces

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y1,
                    mode='lines+markers',line_color='rgb(0,100,80)',
                    name='Tacoma sales'))
fig.add_trace(go.Scatter(x=x, y=y2,
                    mode='lines+markers',
                    name='Portland AND Seattle sales'))

fig.show()
In [ ]:
df_tier1_list=df_tier1.groupby("food_department")[["cost"]].sum().sort_values(by=['cost'],ascending=[False]).reset_index()
df_tier1_list.rename(columns = {'food_department':'Items_Tier_1','cost':'Sales_tier_1'}, inplace = True)
In [ ]:
df_cities_list=df_cities.groupby("food_department")[["cost"]].sum().sort_values(by=['cost'],ascending=[False]).reset_index()
df_cities_list.rename(columns = {'food_department':'Items_Tier2_3','cost':'Sales_tier2_3'}, inplace = True)
In [ ]:
df_sales=pd.concat([df_tier1_list,df_cities_list],axis=1)
df_sales
Out[ ]:
Items_Tier_1 Sales_tier_1 Items_Tier2_3 Sales_tier2_3
0 Produce 78877.630 Produce 148866.540
1 Frozen Foods 60671.430 Snack Foods 124831.490
2 Snack Foods 60426.050 Frozen Foods 111532.940
3 Household 55489.220 Household 107777.820
4 Baking Goods 44282.790 Baking Goods 76650.790
5 Canned Foods 39356.360 Canned Foods 76555.460
6 Dairy 36122.980 Health and Hygiene 66211.650
7 Health and Hygiene 35746.410 Dairy 66183.500
8 Beverages 28394.320 Deli 50831.340
9 Deli 25747.330 Beverages 49155.040
10 Baked Goods 17832.200 Alcoholic Beverages 31319.450
11 Snacks 14829.280 Baked Goods 31018.590
12 Alcoholic Beverages 13648.780 Snacks 24451.440
13 Periodicals 9489.290 Starchy Foods 20801.030
14 Starchy Foods 9416.340 Eggs 18162.190
15 Eggs 8808.910 Periodicals 14483.540
16 Breakfast Foods 7266.300 Breakfast Foods 14237.040
17 Canned Products 3780.770 Seafood 7373.220
18 Seafood 3407.390 Meat 6568.490
19 Meat 3298.440 Canned Products 6476.770
20 Checkout 3121.800 Checkout 6150.020
21 Carousel 1533.160 Carousel 4710.220
In [ ]:
import plotly.graph_objects as go   #importing the library
x=df_sales['Items_Tier_1']
y1=df_sales['Sales_tier_1']
y2=df_sales['Sales_tier2_3']

# Create traces

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y1,
                    mode='lines+markers',line_color='rgb(0,100,80)',
                    name='Tacoma sales'))
fig.add_trace(go.Scatter(x=x, y=y2,
                    mode='lines+markers',
                    name='Portland AND Seattle sales'))

fig.show()
In [ ]:
store_type_relation = df.groupby("store_type")[["unit_sales"]].mean().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
px.bar(store_type_relation, x= 'store_type', y="unit_sales",color='store_type',color_discrete_sequence=px.colors.qualitative.Pastel1)
In [ ]:
duluxe_supermaret_store["promotion_name"].value_counts()
Out[ ]:
Cash Register Lottery     1555
Price Savers              1274
Double Down Sale          1158
Super Duper Savers        1090
Bag Stuffers              1009
Price Winners              975
Dimes Off                  926
One Day Sale               870
Money Savers               812
I Cant Believe It Sale     780
Price Slashers             762
Green Light Days           723
Big Promo                  682
Super Savers               639
Two Day Sale               626
Save-It Sale               620
Tip Top Savings            616
Sales Galore               578
Free For All               560
Sale Winners               532
Two for One                518
Shelf Clearing Days        506
Go For It                  460
Big Time Savings           438
Pick Your Savings          424
Three for One              405
Dollar Days                404
Big Time Discounts         391
Price Cutters              282
Price Smashers             278
Bye Bye Baby               263
Price Destroyers           258
Saving Days                257
High Roller Savings        228
Dollar Cutters             223
Weekend Markdown           220
Sales Days                 220
Green Light Special        200
Best Savings               192
Name: promotion_name, dtype: int64
In [ ]:
supermaret_store = df[df['store_type']=="Supermarket"] 
duluxe_supermaret_store = df[df['store_type']=="Deluxe Supermarket"] 
Gourmet_supermaret_store = df[df['store_type']=="Gourmet Supermarket"]
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"]
#Small_Grocery_store = data[data['store_type']=="Small Grocery"]

stores=["Supermarket","Deluxe Supermarket","Gourmet Supermarket"]
fig, ax = plt.subplots(1,3, figsize=(30,10))

for i,store in enumerate([supermaret_store , duluxe_supermaret_store, Gourmet_supermaret_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('promotion_name')['unit_sales'].sum().reset_index()
    df_category_quantity.sort_values(by='unit_sales', ascending=False)
    plt.barh(df_category_quantity[:10]['promotion_name'], df_category_quantity[:10]['unit_sales'])

    plt.xticks(rotation='vertical', size=20)
    plt.yticks(size=20)
    plt.xlabel('Quantity')
    plt.ylabel('Category')
    #plt.title(stores[i], fontsize=15,color='brown')
    #plt.title('net salary of of each poromotion  in Each super market')
    plt.title(stores[i], fontsize=18,color='brown') 
fig.suptitle('net salary of of each poromotion  in Each super market',fontsize=25, color='brown', y=.98)


    
Out[ ]:
Text(0.5, 0.98, 'net salary of of each poromotion  in Each super market')
  • in Supermarkets: the best two promotion are Big Time Discounts and Dollar Cutters.
  • in Deluxe Supermarkets: the best two promotion are Cash Register Lottery and Bag Stuffers.
  • in Gourmet Supermarkets: the best two promotion are High Roller Savings and One Day Sale
In [ ]:
 
In [ ]:
data=df[df["store_type"]=="Supermarket"]
data["store_city"].value_counts()
Out[ ]:
Portland       5150
Seattle        5051
Spokane        4453
Los Angeles    3960
Bremerton      3451
Orizaba        2621
Acapulco       1506
Name: store_city, dtype: int64
In [ ]:
df["store_type"].value_counts()
Out[ ]:
Supermarket            26192
Deluxe Supermarket     22954
Gourmet Supermarket     6503
Mid-Size Grocery        2846
Small Grocery           1933
Name: store_type, dtype: int64

what is the highest promotion based on sales ?¶

In [ ]:
df_category_quantity = df.groupby('promotion_name')['unit_sales'].sum().reset_index()
df_category_quantity.sort_values(by='unit_sales', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['promotion_name'], df_category_quantity[:10]['unit_sales'])

plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('poromotion')
plt.title('the highest promotion based on sales')
for index, value in enumerate(df_category_quantity[:10]['unit_sales']):
    plt.text(value, index,
             str(value))

plt.show()
  • the best 4 promotions is Big Time Discount , Cash register lottery , Dollar cutters and Dimes off

what is the best food department in each supermarket type ?¶

In [ ]:
supermaret_store = df[df['store_type']=="Supermarket"] 
duluxe_supermaret_store = df[df['store_type']=="Deluxe Supermarket"] 
Gourmet_supermaret_store = df[df['store_type']=="Gourmet Supermarket"]
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"]
#Small_Grocery_store = data[data['store_type']=="Small Grocery"]

stores=["Supermarket","Deluxe Supermarket","Gourmet Supermarket"]
fig, ax = plt.subplots(1,3, figsize=(36,10))

for i,store in enumerate([supermaret_store , duluxe_supermaret_store, Gourmet_supermaret_store]):
    fig.sca(ax[i])
    df_category_quantity = store.groupby('food_department')['unit_sales'].sum().reset_index()
    df_category_quantity.sort_values(by='unit_sales', ascending=False)
    plt.barh(df_category_quantity[:10]['food_department'], df_category_quantity[:10]['unit_sales'])

    plt.xticks(rotation='vertical', size=20)
    plt.yticks(size=20)

    plt.xlabel('Quantity')
    plt.ylabel('Category')
    plt.title('net salary of of each food department  in Each super market')
    plt.title(stores[i], fontsize=20,color='brown') 
fig.suptitle('net salary of of each food department in Each super market',fontsize=20, color='brown', y=.95)


   
Out[ ]:
Text(0.5, 0.95, 'net salary of of each food department in Each super market')
  • dairy and canned food and baking good is the most three department in each supermarket
  • carousel and checkout and canned products is the least three food department
In [ ]:
 

what is the best five food department according to net sales?¶

In [ ]:
food_department_relation = df.groupby("food_department")[["unit_sales"]].sum().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
px.bar(food_department_relation, x= 'food_department', y="unit_sales",color='food_department',color_discrete_sequence=px.colors.qualitative.Set3)

the best five department is produce, snake foods , houshold , frozen foods and canned foods

member_card with unit_sales

In [ ]:
df_category_quantity = df.groupby('member_card')['unit_sales'].sum().reset_index()
df_category_quantity.sort_values(by='unit_sales', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['member_card'], df_category_quantity[:10]['unit_sales'])

plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('Category')
plt.title('Number of Quantities Sold in Each Category')
for index, value in enumerate(df_category_quantity[:10]['unit_sales']):
    plt.text(value, index,
             str(value))
plt.show()
In [ ]:
df_category_quantity = df.groupby('food_department')['unit_sales(in millions)'].sum().reset_index()
df_category_quantity.sort_values(by='unit_sales(in millions)', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['food_department'], df_category_quantity[:10]['unit_sales(in millions)'])

plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('Category')
plt.title('Number of Quantities Sold in Each Category')
for index, value in enumerate(df_category_quantity[:10]['unit_sales(in millions)']):
    plt.text(value, index,
             str(value))
plt.show()

the baking goods department achieve the highest unit sales overall the all department

avg_net_sales(cost with unit_sales) for every promotion_name?¶

In [ ]:
most_SALARY=df.groupby(["food_category"])["store_sales(in millions)"].max().sort_values(ascending=False)[:10]
most_SALARY
Out[ ]:
food_category
Fruit              22.920
Canned Soup        19.900
Pain Relievers     19.900
Dairy              19.900
Vegetables         19.850
Hygiene            19.850
Snack Foods        19.800
Jams and Jellies   19.800
Kitchen Products   19.750
Drinks             19.750
Name: store_sales(in millions), dtype: float64

location and sales

In [ ]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
outlet_size_sales = df.groupby("member_card")[["cost"]].sum().reset_index()
outlet_size_sales.sort_values(by=['cost'],ascending=[False])
Out[ ]:
member_card cost
0 Bronze 3358180.070
2 Normal 1375474.260
1 Golden 746772.690
3 Silver 517799.240
In [ ]:
data = df.groupby("member_card")[["unit_sales"]].sum().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
px.bar(data, x= 'member_card', y="unit_sales",color='member_card',color_discrete_sequence=px.colors.qualitative.Set3)

yearly income with net_salary?

avg_yearly_income with the most unit sales?¶

In [ ]:
df_category_quantity = df.groupby('avg. yearly_income')['unit_sales'].sum().reset_index()
df_category_quantity.sort_values(by='unit_sales', ascending=False)
plt.figure(figsize=(12,6))
plt.barh(df_category_quantity[:10]['avg. yearly_income'], df_category_quantity[:10]['unit_sales'])

plt.xticks(rotation='vertical', size=8)
plt.xlabel('Quantity')
plt.ylabel('Category')
plt.title('Number of Quantities Sold in Each Category')
for index, value in enumerate(df_category_quantity[:10]['unit_sales']):
    plt.text(value, index,
             str(value))
plt.show()

media type¶

In [ ]:
store_type_relation = df.groupby("media_type")[["cost"]].mean().sort_values(by=['cost'],ascending=[False]).reset_index()
px.bar(store_type_relation, x= 'media_type', y="cost",color='media_type',color_discrete_sequence=px.colors.qualitative.Pastel1)
In [ ]:
store_type_relation = df.groupby("media_type")[["unit_sales"]].mean().sort_values(by=['unit_sales'],ascending=[False]).reset_index()
px.bar(store_type_relation, x= 'media_type', y="unit_sales",color='media_type',color_discrete_sequence=px.colors.qualitative.Pastel1)
In [ ]:
store_type_relation = df.groupby("media_type")[["cost"]].mean().sort_values(by=['cost'],ascending=[False]).reset_index()
px.bar(store_type_relation, x= 'media_type', y="cost",color='media_type',color_discrete_sequence=px.colors.qualitative.Pastel1)
  • cash register handout is the highest cost although the daily paper,radio is the highest frequency
  • we recommend to use TV and daily paper and radio more often
In [ ]:
usa_store = df[df['sales_country']=="USA"]  
Mexico_store = df[df['sales_country']=="Mexico"]  
Canada_store = df[df['sales_country']=="Canada"] 
#Mid_Size_Grocery_store = data[data['store_type']=="Mid-Size Grocery"] 
#Small_Grocery_store = data[data['store_type']=="Small Grocery"] 
 
stores=["USA"," Mexico","canada"] 
fig, ax = plt.subplots(1,3, figsize=(36,10)) 
 
for i,store in enumerate([usa_store , Mexico_store, Canada_store]): 
    fig.sca(ax[i]) 
    df_category_quantity = store.groupby('media_type')['cost'].mean().reset_index() 
    df_category_quantity.sort_values(by='cost', ascending=False) 
    plt.barh(df_category_quantity[:10]['media_type'], df_category_quantity[:10]['cost']) 
 
    plt.xticks(rotation='vertical', size=8) 
    plt.yticks(size=20)
    plt.xlabel('Quantity') 
    plt.ylabel('Category') 
    plt.title('net salary of of each poromotion  in Each super market')
    plt.title(stores[i], fontsize=20,color='brown') 
fig.suptitle('the highest media type according to cost in each country',fontsize=20, color='brown', y=.95)
Out[ ]:
Text(0.5, 0.95, 'the highest media type according to cost in each country')
  • in USA ,mexico and canada the highest cost is cash register
  • in usa , mexico the least cost is daily paper
  • in canada the least cost is radio
In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import  metrics


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor

import xgboost as xgb
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor

🔨FEATURE_ENGINEERING_

In [ ]:
from sklearn.feature_selection import mutual_info_regression
def make_mi_score(x,y):
    x = x.copy()
    for colname in x.select_dtypes(['object','category']):
        x[colname], _ = x[colname].factorize()
        
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in x.dtypes]
    mi_score = mutual_info_regression(x,y,discrete_features=discrete_features,random_state=0)
    mi_score = pd.Series(mi_score,name='Mutual Information Score',index=x.columns)
    mi_score = mi_score.sort_values(ascending=False)
    return mi_score
In [ ]:
x = df.copy()
y = x.pop('cost')

mi_score = make_mi_score(x,y)
mi_score
Out[ ]:
promotion_name                 3.776
store_sqft                     2.734
grocery_sqft                   2.732
frozen_sqft                    2.731
meat_sqft                      2.730
store_city                     2.715
media_type                     2.541
store_state                    1.924
store_type                     1.156
sales_country                  0.819
florist                        0.664
prepared_food                  0.615
salad_bar                      0.612
coffee_bar                     0.603
video_store                    0.583
avg. yearly_income_no_dollar   0.099
avg. yearly_income             0.099
unit_sales(in millions)        0.074
total_children                 0.073
num_children_at_home           0.070
avg_cars_at home(approx)       0.062
avg_cars_at home(approx).1     0.059
education                      0.055
occupation                     0.053
member_card                    0.041
gender                         0.017
houseowner                     0.014
marital_status                 0.012
unit_sales                     0.010
store_cost(in millions)        0.002
units_per_case                 0.002
food_category                  0.000
low_fat                        0.000
recyclable_package             0.000
net_weight                     0.000
gross_weight                   0.000
food_department                0.000
brand_name                     0.000
store_sales(in millions)       0.000
food_family                    0.000
SRP                            0.000
Name: Mutual Information Score, dtype: float64
In [ ]:
def plot_mi_score(score):
    score = score.sort_values(ascending = True)
    width = np.arange(len(score))
    ticks = list(score.index)
    plt.figure(figsize=(14,14))
    plt.barh(width,score)
    plt.yticks(width,ticks)
    plt.title('Mutual Information Score')

plot_mi_score(mi_score)

LABLE_ENCODING¶

In [ ]:
#["food_category",'food_department','food_family','promotion_name','sales_country','marital_status','gender','education','member_card','occupation','houseowner','avg. yearly_income','brand_name','store_type','store_city','store_state','media_type']
food_category_encoder = LabelEncoder()
df['food_category']= food_category_encoder.fit_transform(df['food_category'])

food_department_encoder = LabelEncoder()
df['food_department']= food_department_encoder.fit_transform(df['food_department'])

food_family_encoder = LabelEncoder()
df['food_family']= food_family_encoder.fit_transform(df['food_family'])

promotion_name_encoder = LabelEncoder()
df['promotion_name']= promotion_name_encoder.fit_transform(df['promotion_name'])

sales_country_encoder = LabelEncoder()
df['sales_country']= sales_country_encoder.fit_transform(df['sales_country'])

marital_status_encoder = LabelEncoder()
df['marital_status']= marital_status_encoder.fit_transform(df['marital_status'])

gender_encoder = LabelEncoder()
df['gender']= gender_encoder.fit_transform(df['gender'])

education_encoder = LabelEncoder()
df['education']= education_encoder.fit_transform(df['education'])

member_card_encoder = LabelEncoder()
df['member_card']= member_card_encoder.fit_transform(df['member_card'])

occupation_encoder = LabelEncoder()
df['occupation']= occupation_encoder.fit_transform(df['occupation'])

houseowner_encoder = LabelEncoder()
df['houseowner']= houseowner_encoder.fit_transform(df['houseowner'])

yearly_income_encoder = LabelEncoder()
df['avg. yearly_income_no_dollar']= yearly_income_encoder.fit_transform(df['avg. yearly_income_no_dollar'])

brand_name_encoder = LabelEncoder()
df['brand_name']= brand_name_encoder.fit_transform(df['brand_name'])

store_type_encoder = LabelEncoder()
df['store_type']= store_type_encoder.fit_transform(df['store_type'])

store_city_encoder = LabelEncoder()
df['store_city']= store_city_encoder.fit_transform(df['store_city'])

store_state_encoder = LabelEncoder()
df['store_state']= store_state_encoder.fit_transform(df['store_state'])

media_type_encoder = LabelEncoder()
df['media_type']= media_type_encoder.fit_transform(df['media_type'])
#final_data = encoder.fit_transform(df.drop(columns='cost'))
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60428 entries, 0 to 60427
Data columns (total 42 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   food_category                 60428 non-null  int64  
 1   food_department               60428 non-null  int64  
 2   food_family                   60428 non-null  int64  
 3   store_sales(in millions)      60428 non-null  float64
 4   store_cost(in millions)       60428 non-null  float64
 5   unit_sales(in millions)       60428 non-null  float64
 6   promotion_name                60428 non-null  int64  
 7   sales_country                 60428 non-null  int64  
 8   marital_status                60428 non-null  int64  
 9   gender                        60428 non-null  int64  
 10  total_children                60428 non-null  float64
 11  education                     60428 non-null  int64  
 12  member_card                   60428 non-null  int64  
 13  occupation                    60428 non-null  int64  
 14  houseowner                    60428 non-null  int64  
 15  avg_cars_at home(approx)      60428 non-null  float64
 16  avg. yearly_income            60428 non-null  object 
 17  num_children_at_home          60428 non-null  float64
 18  avg_cars_at home(approx).1    60428 non-null  float64
 19  brand_name                    60428 non-null  int64  
 20  SRP                           60428 non-null  float64
 21  gross_weight                  60428 non-null  float64
 22  net_weight                    60428 non-null  float64
 23  recyclable_package            60428 non-null  float64
 24  low_fat                       60428 non-null  float64
 25  units_per_case                60428 non-null  float64
 26  store_type                    60428 non-null  int64  
 27  store_city                    60428 non-null  int64  
 28  store_state                   60428 non-null  int64  
 29  store_sqft                    60428 non-null  float64
 30  grocery_sqft                  60428 non-null  float64
 31  frozen_sqft                   60428 non-null  float64
 32  meat_sqft                     60428 non-null  float64
 33  coffee_bar                    60428 non-null  float64
 34  video_store                   60428 non-null  float64
 35  salad_bar                     60428 non-null  float64
 36  prepared_food                 60428 non-null  float64
 37  florist                       60428 non-null  float64
 38  media_type                    60428 non-null  int64  
 39  cost                          60428 non-null  float64
 40  avg. yearly_income_no_dollar  60428 non-null  int64  
 41  unit_sales                    60428 non-null  float64
dtypes: float64(24), int64(17), object(1)
memory usage: 19.4+ MB

Train Test Split¶

In [ ]:
x = df.drop(columns=['cost',"unit_sales","avg. yearly_income","SRP","food_category","low_fat","net_weight","food_department","store_cost(in millions)","brand_name","gross_weight","net_weight"])
y = df['cost']
In [ ]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
In [ ]:
x_train.shape
Out[ ]:
(42299, 31)

Modeling¶

Linear Regression¶

In [ ]:
lr = LinearRegression()
lr.fit(x_train,y_train)
print('Attempting to fit Linear Regressor')
Attempting to fit Linear Regressor
In [ ]:
y_pred_val_lr = lr.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_lr))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_lr))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_lr)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lr))
print("\n")
MAE on Validation set : 25.446287942525657


MSE on Validation set : 876.016344656049


RMSE on Validation set : 5.044431379504102


R2 Score on Validation set : 0.03246551904513406


Support Vector Regressor¶

In [ ]:
svm = SVR()
svm.fit(x_train,y_train)
print('Attempting to fit Support Vector Regressor')
Attempting to fit Support Vector Regressor
In [ ]:
y_pred_val_svm = svm.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_svm))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_svm))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_svm)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_svm))
print("\n")
MAE on Validation set : 25.430824008612543


MSE on Validation set : 888.7690687910297


RMSE on Validation set : 5.042898373813669


R2 Score on Validation set : 0.018380507501720822


Decision Tree Regressor¶

In [ ]:
dc = DecisionTreeRegressor(random_state = 0)
dc.fit(x_train,y_train)
print('Attempting to fit Decision Tree Regressor')
Attempting to fit Decision Tree Regressor
In [ ]:
y_pred_val_dc = dc.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_dc))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_dc))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_dc)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_dc))
print("\n")
MAE on Validation set : 0.04107231507548138


MSE on Validation set : 1.5291222902531858


RMSE on Validation set : 0.20266305799400489


R2 Score on Validation set : 0.9983111290668926


Random Forest Regressor¶

In [ ]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
print('Attempting to fit Random Forest Regressor')
Attempting to fit Random Forest Regressor
In [ ]:
y_pred_val_rf = rf.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_rf))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_rf))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_rf)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_rf))
print("\n")
MAE on Validation set : 0.05551054663813577


MSE on Validation set : 0.7075218580258164


RMSE on Validation set : 0.2356067627173205


R2 Score on Validation set : 0.9992185627610202


In [ ]:
from sklearn.model_selection import cross_val_score
cvs = cross_val_score(rf,x_train,y_train,cv=10,n_jobs=-1)
print('Accuracy: {:.2f} %'.format(cvs.mean()*100))
print('Standard Deviation: {:.2f} %'.format(cvs.std()*100))
Accuracy: 99.91 %
Standard Deviation: 0.03 %
In [ ]:
plt.scatter(y_test,y_pred_val_rf)
Out[ ]:
<matplotlib.collections.PathCollection at 0x7f0a7ce3e910>

MLP Regressor¶

In [ ]:
mlp = MLPRegressor(max_iter = 300)
mlp.fit(x_train, y_train)
print('Attempting to fit MLP Regressor')
Attempting to fit MLP Regressor
In [ ]:
y_pred_val_mlp = mlp.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_mlp))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_mlp))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_mlp)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_mlp))
print("\n")
MAE on Validation set : 28.533542649786032


MSE on Validation set : 1161.6801167679357


RMSE on Validation set : 5.341679759194296


R2 Score on Validation set : -0.283041778465853


Gradient Boosting Regressor¶

In [ ]:
gbc = GradientBoostingRegressor()
gbc.fit(x_train, y_train)
print('Attempting to fit Gradient Boosting Regressor')
Attempting to fit Gradient Boosting Regressor
In [ ]:
y_pred_val_gbc = gbc.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_gbc))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_gbc))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_gbc)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_gbc))
print("\n")
MAE on Validation set : 16.177236536913203


MSE on Validation set : 391.49473158977816


RMSE on Validation set : 4.022093551486987


R2 Score on Validation set : 0.5676054970481137


Light GBM Regressor¶

In [ ]:
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
print('Attempting to fit Light GBM Regressor')
Attempting to fit Light GBM Regressor
In [ ]:
y_pred_val_lgbm = lgbm.predict(x_test)
print('MAE on Validation set :',metrics.mean_absolute_error(y_test, y_pred_val_lgbm))
print("\n")
print('MSE on Validation set :',metrics.mean_squared_error(y_test, y_pred_val_lgbm))
print("\n")
print('RMSE on Validation set :',np.sqrt(metrics.mean_absolute_error(y_test, y_pred_val_lgbm)))
print("\n")
print('R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lgbm))
print("\n")
MAE on Validation set : 3.281731775269041


MSE on Validation set : 21.739774523220383


RMSE on Validation set : 1.8115550710008905


R2 Score on Validation set : 0.9759890536429906


In [ ]:
print('Decision Tree Regressor R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_dc))
print('Random Forest Regressor R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_rf))
print('Light GBM R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lgbm))
print('Logistic Regression R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_lr))
print('SVR R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_svm))
print('MLP Regressor R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_mlp))
print('Gradient Boosting R2 Score on Validation set :',metrics.r2_score(y_test, y_pred_val_gbc))
Decision Tree Regressor R2 Score on Validation set : 0.9983111290668926
Random Forest Regressor R2 Score on Validation set : 0.9992185627610202
Light GBM R2 Score on Validation set : 0.9759890536429906
Logistic Regression R2 Score on Validation set : 0.03246551904513406
SVR R2 Score on Validation set : 0.018380507501720822
MLP Regressor R2 Score on Validation set : -0.283041778465853
Gradient Boosting R2 Score on Validation set : 0.5676054970481137